A project for CAPP 30239 using Altair
Eric Langowski
import altair as alt
import pandas as pd
import geopandas as gpd
import json
import gpd_lite_toolbox as glt
from gpd_lite_toolbox.utils_carto import m_plot_dataframe
import numpy as np
#from google.colab import drive
#drive.mount('/content/gdrive')
#file_path = '/content/gdrive/My Drive/colab data/'
file_path = "C:/Users/erhla/Desktop/data viz/hmwk 2/"
# define the theme by returning the dictionary of configurations
def my_theme():
    """Return the Altair theme configuration dictionary.

    Defines 500x300 default views on a light-grey background, bold
    left-anchored titles, filled circular points, and enlarged
    axis/legend labels.
    """
    view_cfg = {'height': 300, 'width': 500}
    title_cfg = {
        'anchor': 'start',
        'color': '#000000',
        'font': 'Benton Gothic Bold, sans-serif',
        'fontSize': 22,
        'fontWeight': 'bold',
    }
    point_cfg = {'filled': True, 'shape': 'circle'}
    axis_cfg = {
        'tickSize': 10,
        'titleFontSize': 14,
        'titlePadding': 10,
        'labelPadding': 4,
        'labelFontSize': 10,
    }
    legend_cfg = {
        'labelFontSize': 11,
        'padding': 1,
        'symbolSize': 60,
        'symbolType': 'square',
        'titleFontSize': 14,
        'titlePadding': 10,
    }
    return {
        'config': {
            'view': view_cfg,
            'title': title_cfg,
            'point': point_cfg,
            'background': '#f0f0f0',
            'axis': axis_cfg,
            'legend': legend_cfg,
        }
    }
# register the custom theme under a chosen name
alt.themes.register('my_theme', my_theme)
# enable the newly registered theme so every chart below inherits it
alt.themes.enable('my_theme')
#load data
transit_data = pd.read_csv(file_path + "Chicago_Traffic_Tracker_-_Historical_Congestion_Estimates_by_Region_-_2018-Current.csv")
tract_times = pd.read_csv(file_path + "tract_drive_times_cook_csds.csv")
snow_data = pd.read_csv(file_path + "chicago_snow.csv")
chicago_data = gpd.read_file(file_path + 'chicago.geojson')
ca_data = gpd.read_file(file_path + 'community_areas.geojson')
#process data
# keep only tract-to-tract drive times where both endpoints are Chicago tracts
chicago_tract_times = tract_times[(tract_times["origin"].isin(chicago_data["GEOID"])) & (tract_times["destination"].isin(chicago_data["GEOID"]))]
# mean drive time from each origin tract to all destination tracts
tract_avg = chicago_tract_times.groupby('origin')['minutes'].mean().reset_index()
# convert the GeoDataFrames to GeoJSON feature lists that Altair can consume
choro_json = json.loads(chicago_data.to_json())
choro_data = alt.Data(values=choro_json['features'])
choro_json_ca = json.loads(ca_data.to_json())
choro_data_ca = alt.Data(values=choro_json_ca['features'])
# rush hour = 7-9am and 4-6pm on weekdays (DAY_OF_WEEK 2-6).
# .copy() so the derived-column assignments below write into an independent
# frame rather than a view of transit_data (avoids SettingWithCopyWarning
# and guarantees the new columns actually land).
rushhour_data = transit_data[(transit_data["HOUR"].isin([7, 8, 9, 16, 17, 18])) & (transit_data["DAY_OF_WEEK"].isin([2, 3, 4, 5, 6]))].copy()
rushhour_data["TIME"] = pd.to_datetime(rushhour_data["TIME"])
rushhour_data["YEAR"] = rushhour_data["TIME"].dt.year
# NOTE(review): Series.dt.week is deprecated in newer pandas; the
# replacement is .dt.isocalendar().week (returns UInt32) — confirm before upgrading
rushhour_data["WEEK"] = rushhour_data["TIME"].dt.week
rushhour_data["DAY"] = rushhour_data["TIME"].dt.day
# round each timestamp down to its containing day for daily aggregation
rushhour_data["round_day"] = rushhour_data.TIME.dt.round('D', 'shift_backward')
# January/February observations, restricted to ISO weeks 2-5
first_five_weeks = rushhour_data[(rushhour_data["MONTH"].isin([1, 2]))]
mini_five_weeks = first_five_weeks[first_five_weeks["WEEK"].isin([2, 3, 4, 5])]
#make plot data
# plot 1: mean rush-hour speed per region per year (weeks 2-5 of Jan/Feb)
plot_1_data = mini_five_weeks.groupby(["REGION", "YEAR"])["SPEED"].mean().reset_index()
# pct_change compares consecutive rows; after the groupby sort each region's
# 2020 row follows its own 2019 row, and the YEAR == 2020 filter below
# discards the meaningless cross-region comparisons. The sign flip makes a
# positive value mean average speed fell year-over-year.
plot_1_data['pct_change'] = -1 * plot_1_data["SPEED"].pct_change()
plot_1_data = plot_1_data[plot_1_data["YEAR"] == 2020]
plot_1_data.sort_values(["SPEED"], inplace=True)
plot_1_data = plot_1_data[plot_1_data["SPEED"] < 30]
# histogram input: daily mean speed in 2019 for the three regions targeted
# by the rideshare tax
plot_2_data = rushhour_data[(rushhour_data["TIME"] >= '01-01-2019') &
                            (rushhour_data["TIME"] <= '12-31-2019') &
                            (rushhour_data["REGION"].isin(['Lincoln Park-Lake View', 'Near North', 'Chicago Loop']))]
plot_2_data = plot_2_data.groupby(['REGION', 'round_day'])['SPEED'].mean().reset_index()
# citywide daily averages keyed by a M/D/YYYY string so they can join the
# NOAA snowfall file on its DATE column
day_averages = mini_five_weeks.groupby(['YEAR', 'MONTH', 'WEEK', 'DAY'])['SPEED'].mean().reset_index()
day_averages['DATE'] = day_averages['MONTH'].astype(str) + '/' + day_averages['DAY'].astype(str) + '/' + day_averages['YEAR'].astype(str)
day_snow_avg = day_averages.merge(snow_data, on='DATE')
# total snowfall and mean speed per day; the avg_speed assignment relies on
# both groupbys producing rows in the same (YEAR, DATE) order so the
# positional index alignment is correct
new = day_snow_avg.groupby(['YEAR', 'DATE'])['SNOW'].sum().reset_index()
new['avg_speed'] = day_snow_avg.groupby(['YEAR', 'DATE'])['SPEED'].mean().reset_index()['SPEED']
new = new[new['SNOW'] > 0 ]
# rideshare-tax chart input: weekly mean Loop speed, labeled by the rounded
# first observation of each week, from November 2019 onward
loop_only = rushhour_data[rushhour_data["REGION"] == "Chicago Loop"]
plot_3_data = loop_only.groupby(["YEAR", "WEEK"])['SPEED'].mean().reset_index()
plot_3_data["first_day"] = loop_only.groupby(["YEAR", "WEEK"])['TIME'].min().reset_index()["TIME"].dt.round('D').astype(str)
plot_3_data = plot_3_data[plot_3_data["first_day"] > '2019-11-01']
# Bar chart: year-over-year change in average rush-hour speed by community
# area, first five working weeks of 2019 vs. 2020.
plot_1 = alt.Chart(plot_1_data).mark_bar().encode(
    y=alt.Y('REGION:O', sort='x', title="Community Area"),
    x=alt.X('pct_change:Q', title="2019 to 2020 change", axis=alt.Axis(format='%')),
    # color by the sign of the (sign-flipped) change
    color=alt.condition(
        alt.datum.pct_change > 0,
        alt.value("steelblue"),
        alt.value("orange"))
).properties(
    # fixed typo in the displayed title: "uniformally" -> "uniformly"
    title={"text": ["Congestion has uniformly decreased in 2020..."],
           "subtitle": ["Comparing rush hours in the first five working weeks of 2020 and 2019",
                        "Source: City of Chicago Open Data, kf7e-cur8"], },
    height=500
)
plot_1
The data for this chart derives from the speeds at which CTA buses travel Chicago's streets. There is a lot of variance across Chicago's neighborhoods, but this plot shows that all neighborhoods have gotten less congested this year. The sample is very small at this point (only 5 weeks), so these results should become more substantive as the quarter progresses.
# Scatter of daily total snowfall vs. average speed, one point per snowy day.
# NOTE(review): despite the name, plot_2 charts `new` (the snow/speed frame),
# not plot_2_data — the plot_N/plot_N_data names are offset in this script.
plot_2 = alt.Chart(new).mark_circle(size=500).encode(
    x=alt.X('SNOW:Q',title='Daily Total Snowfall'),
    y=alt.Y('avg_speed:Q',title='Average Speed'),
    # orange = 2019 (the polar-vortex year), blue = other years
    color=alt.condition(
        alt.datum.YEAR == 2019,
        alt.value("orange"),
        alt.value("steelblue"))
).properties(
    title = {"text": ["...except 2019 had the Polar Vortex"],
             "subtitle": ["Days with measurable snow in the first five working weeks of 2020 (3) and 2019 (9)",
                          "Source: City of Chicago Open Data, kf7e-cur8 & NOAA Climate Data Online Tool"],}
)
plot_2
Most days had no snow and were excluded. There were nine days with measurable snowfall in 2019 and three in 2020.
# Mean speed per (day, hour) for Jan-Apr 2019, with a translucent rectangle
# highlighting the polar-vortex window 1/24/2019 - 2/2/2019.
target_data = rushhour_data[rushhour_data['YEAR'].isin([2019])]
target_data = target_data[target_data['round_day'] < '2019-05-01']
day_region = target_data.groupby(['round_day', 'HOUR'])['SPEED'].mean().reset_index()
base = alt.Chart(day_region).mark_circle(size=75).encode(
    x=alt.X('round_day:T',title='Date'),
    y=alt.Y('SPEED:Q',title='Average Speed'),
    color = alt.Color('SPEED:Q', title="Speed",scale=alt.Scale(scheme='magma')
)).properties(
    title = {"text": ["How bad was the polar vortex?"],
             "subtitle": ["The two week period from 1/24/2019 to 2/2/2019 accounted for about half of 2019's slowest commutes",
                          "Source: City of Chicago Open Data, kf7e-cur8"],}
)
# shaded band over the vortex date range; y 0-30 spans the speed axis
annotation = alt.Chart(pd.DataFrame(data={'x': ['01-24-2019'], 'x2': ['02-02-2019'], 'y': [0], 'y2': [30]})
).mark_rect(opacity=0.4
).encode(
    y='y',
    y2='y2',
    x='x:T',
    x2='x2:T'
)
# layer the highlight rectangle on top of the scatter
base + annotation
The commute on most days is pretty uniform, except for late January 2019. These weather-affected commutes were among the slowest commutes of the last three years.
# Weekly mean Loop speed around the Jan 1, 2020 rideshare-tax start date;
# post-tax weeks are colored blue, pre-tax weeks orange.
plot_4 = alt.Chart(plot_3_data).mark_bar().encode(
    x=alt.X("first_day:O", title="Week of"),
    y=alt.Y("SPEED:Q", title="Average Speed"),
    # string comparison works because first_day is an ISO-formatted date string
    color=alt.condition(
        alt.datum.first_day >= '2020-01-01',
        alt.value("steelblue"),
        alt.value("orange"))
).properties(
    title = {"text": ["Congestion in the Loop is Unchanged"],
             "subtitle": ["New rideshare tax was implemented on Jan 1, 2020", "Source: City of Chicago Open Data, kf7e-cur8"]})
plot_4
With the implementation of the tax on January 1, 2020, this plot is searching for a regression discontinuity (a sharp change in congestion). With the present limited sample size, there is not a lot to be said from this figure yet.
# Overlapping step histograms of daily mean speed for the three regions
# targeted by the rideshare tax. NOTE(review): plot_3 charts plot_2_data —
# the plot_N/plot_N_data names are offset in this script.
plot_3 = alt.Chart(plot_2_data).mark_area(opacity=0.5, interpolate='step').encode(
    x=alt.X('SPEED:Q', bin=alt.Bin(maxbins=100), title="Average Speed", axis=alt.Axis(values=[10, 15, 20])),
    y=alt.Y('count()', stack=None, title="Count"),
    color='REGION:N').properties(
    title = {"text": ["Chicago's Three Most Congested Neighborhoods"],
             "subtitle": ["From commutes in 2019", "Source: City of Chicago Open Data, kf7e-cur8"],})
plot_3
This plot shows congestion in the three most congested neighborhoods, including extremely good and bad commutes in the tails. These three neighborhoods were targeted by the Mayor in the new rideshare tax as highly congested areas. The tax is designed to encourage rideshare users in these areas to take public transit.
# Tract-level choropleths of commute-mode share. `base` is a blank white
# community-area outline reused behind each bubble map below.
base = alt.Chart(choro_data_ca).mark_geoshape(stroke='black',
    strokeWidth=0.5, fill='white').properties(
    width=300,
)
# only the first map sets the scale/legend title explicitly
drive = alt.Chart(choro_data, title="Driving to Work").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    alt.Color('properties.pct_drive:Q', title = "Percent", scale=alt.Scale(scheme='lightmulti'))
).properties(
    width=300,
)
transit = alt.Chart(choro_data, title="Taking Transit to Work").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    color='properties.pct_transit:Q'
).properties(
    width=300,
)
bicycle = alt.Chart(choro_data, title="Walking or Biking to Work").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    color=alt.Color('properties.pct_bicyclewalk:Q')
).properties(
    width=300,
)
# Community-area bubble maps: one circle at each area's centroid (Longitude/
# Latitude fields), sized and colored by the same mode-share fields as the
# tract maps.
drive_points = alt.Chart(choro_data_ca).mark_circle().encode(
    longitude='properties.Longitude:Q',
    latitude='properties.Latitude:Q',
    size=alt.Size('properties.pct_drive:Q',legend=None),
    color = 'properties.pct_drive:Q'
)
transit_points = alt.Chart(choro_data_ca).mark_circle().encode(
    longitude='properties.Longitude:Q',
    latitude='properties.Latitude:Q',
    size=alt.Size('properties.pct_transit:Q',legend=None),
    color = 'properties.pct_transit:Q'
)
bicycle_points = alt.Chart(choro_data_ca).mark_circle().encode(
    longitude='properties.Longitude:Q',
    latitude='properties.Latitude:Q',
    size=alt.Size('properties.pct_bicyclewalk:Q',legend=None),
    color = 'properties.pct_bicyclewalk:Q'
)
# three rows: tract choropleth on the left, outline + centroid bubbles right
main = (drive | base + drive_points) & (transit | base + transit_points) & (bicycle | base + bicycle_points)
main.properties(
    title = {"text": ["How do Chicagoans get to work? Mostly by car."],
             "subtitle": ["Source: American Community Survey", ""],})
This series of side-by-side maps shows how Chicagoans get to work by census tract. On the left, tracts are shaded by value and on the right, tracts are aggregated into community areas which then receive an appropriately colored/sized circle.
# Community-area choropleths of commute-duration share, one map per time
# bucket; only the first map sets the scale/legend title explicitly.
ten = alt.Chart(choro_data_ca, title="Commute Less than 10 minutes").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    alt.Color('properties.pct_commute_less_than_ten:Q', scale=alt.Scale(scheme='lightmulti'), title = "Percent")
).properties(
    width=300,
    height=300
)
twenty = alt.Chart(choro_data_ca, title="Commute 10 to 20 minutes").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    color = 'properties.pct_commute_ten_twenty:Q'
).properties(
    width=300,
    height=300
)
thirty = alt.Chart(choro_data_ca, title="Commute 20 to 30 minutes").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    color = 'properties.pct_commute_twenty_thirty:Q'
).properties(
    width=300,
    height=300
)
forty = alt.Chart(choro_data_ca, title="Commute 30 to 40 minutes").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    color = 'properties.pct_commute_thirty_forty:Q'
).properties(
    width=300,
    height=300
)
fifty = alt.Chart(choro_data_ca, title="Commute 40 to 50 minutes").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    color = 'properties.pct_commute_forty_fifty:Q'
).properties(
    width=300,
    height=300
)
plus = alt.Chart(choro_data_ca, title="Commute 50+ minutes").mark_geoshape(stroke='black',
    strokeWidth=0.5).encode(
    color = 'properties.pct_commute_fifty_plus:Q'
).properties(
    width=300,
    height=300
)
# arrange the six maps in a 2-wide, 3-tall grid
main2 = (ten | twenty) & (thirty | forty) & (fifty | plus)
main2.properties(
    title = {"text": ["The Far South Side has the longest commutes on average"],
             "subtitle": ["Source: American Community Survey", ""],})
This series of plots shows how long it takes Chicagoans to get to work.
# Tract choropleth of the average drive time to every other tract; the
# `minutes` field is joined in at render time from tract_avg via
# transform_lookup on the tract GEOID.
alt.Chart(choro_data).mark_geoshape(stroke='grey').encode(
    alt.Color('minutes:Q', scale=alt.Scale(scheme = "lightmulti"), title="Minutes")
).transform_lookup(
    lookup='properties.GEOID',
    from_=alt.LookupData(tract_avg, 'origin', ['minutes'])).properties(
    width=750,
    height=750
).properties(
    title = {"text": ["Driving 'Connectivity'"],
             "subtitle": ["Average time to drive to every other Chicago census tract",
                          "Source: Center for Spatial Data Science"],}
)
#make grid
#get all tracts once
# any single origin yields one row per destination tract. .copy() so the
# GEOID assignment below writes into an independent frame instead of a view
# of chicago_tract_times (avoids SettingWithCopyWarning).
mini_tracts = chicago_tract_times[chicago_tract_times['origin'] == 17031010100].copy()
mini_tracts['GEOID'] = mini_tracts['destination']
mini_tracts = mini_tracts[['GEOID', 'minutes']]
#get tract centroids
centroid_data = gpd.GeoDataFrame(chicago_data['GEOID'].astype('int64'), geometry = chicago_data.centroid)
centroid_data = centroid_data.merge(mini_tracts, on='GEOID')
#get grid: aggregate tract centroids into 0.05-degree cells (mean of minutes)
firstgrid = glt.gridify_data(centroid_data, .05, 'minutes', cut=False, method=np.mean)
# spatial join mapping each grid cell to the tract centroids it contains
grid_to_tracts = gpd.sjoin(firstgrid, centroid_data)
#make data for each grid cell
geo_tracts = gpd.GeoDataFrame(chicago_data['GEOID'].astype('int64'), geometry = chicago_data.centroid)
# drive times geolocated at the DESTINATION tract's centroid
ctt_dest_geo = geo_tracts.merge(chicago_tract_times, left_on="GEOID", right_on="destination")
grid_plot_dict = {}
# pixel dimensions of each mini-map cell in the OD grid below
sub_height = 105
sub_width = 80
#make non empty plots
# build one miniature gridded map per occupied grid cell, keyed by cell index
for i in set(grid_to_tracts.index.values):
    #filter and grid data
    current_grid_data = grid_to_tracts[grid_to_tracts.index == i]
    # drive times whose ORIGIN tract lies in this cell, gridded over destinations
    current_grid_tracts = ctt_dest_geo[ctt_dest_geo['origin'].isin(current_grid_data["GEOID"])]
    gridded_data = glt.gridify_data(current_grid_tracts, .05, 'minutes', cut=False, method=np.mean)
    # replace the -1 placeholder values with NaN so empty cells render blank
    gridded_data.minutes = gridded_data.minutes.replace({-1: np.NaN})
    #plot data
    alt_cur_grid = alt.Data(values=json.loads(gridded_data.to_json())['features'])
    cur_grid_plot = alt.Chart(alt_cur_grid).mark_geoshape(
        stroke='grey', strokeWidth=0.5
    ).encode(color = alt.Color('properties.minutes:Q', legend=None, scale=alt.Scale(scheme = "lightmulti"))
    ).properties(width=sub_width, height=sub_height)
    grid_plot_dict[i] = cur_grid_plot
#make empty plots
# invisible placeholder chart used for grid cells with no data; the height
# is nudged by +1.9px so placeholder rows line up with the geoshape cells.
# NOTE(review): this rebinds `annotation` from the polar-vortex chart above,
# which has already been rendered by this point.
annotation = alt.Chart(pd.DataFrame(data={'x': [-1], 'x2': [1], 'y': [-1], 'y2': [1]})).mark_rect(opacity=0.0
).encode(
    y=alt.Y('y', title=None, axis=None),
    y2='y2',
    x=alt.X('x', title=None, axis=None),
    x2='x2'
).properties(
    width=sub_width, height=sub_height+1.9)
#functions to combine together from stackoverflow
def make_hcc(row_of_charts):
    """Concatenate one row's charts into a single Altair unit.

    NOTE(review): despite the 'hcc' name this builds a VConcatChart; paired
    with facet_wrap's HConcatChart the final grid's orientation depends on
    exactly this pairing, so the construction is preserved as-is.
    """
    # list() replaces the original no-op identity comprehension
    return alt.VConcatChart(vconcat=list(row_of_charts), spacing=-0.5)
def facet_wrap(charts, charts_per_row):
    """Arrange a flat list of charts into a grid, charts_per_row per group.

    Each consecutive run of charts_per_row charts is concatenated with
    make_hcc, the runs are concatenated together with near-zero spacing,
    and grid lines are enabled on both axes.
    """
    groups = []
    for start in range(0, len(charts), charts_per_row):
        groups.append(make_hcc(charts[start:start + charts_per_row]))
    combined = alt.HConcatChart(hconcat=groups, spacing=-0.5)
    return combined.configure_axisX(grid=True).configure_axisY(grid=True)
# fill the fixed 8x8 grid, substituting the invisible placeholder for any
# cell index that produced no data
plot_ls = [grid_plot_dict.get(i, annotation) for i in range(0,64)]
# assemble the facet grid
compound_chart = facet_wrap(plot_ls, charts_per_row=8)
compound_chart.properties(
    title = {"text": ["Driving 'Connectivity'"],
             "subtitle": ["An Origin-Destination Map for Chicago. Every 'cell' is a unique map for that location.",
                          "Source: Center for Spatial Data Science"],}
)
This plot is a "remake" of the plot before in the form of an origin-destination map. Every "cell" of the map corresponds to a map of the average drive time to and from that "cell". For example, the bottom left cell shows drive times from the bottom left of Chicago to the rest of Chicago. This draws from Wood 2010.
Jo Wood, Jason Dykes & Aidan Slingsby (2010) Visualisation of Origins, Destinations and Flows with OD Maps, The Cartographic Journal, 47:2, 117-129, DOI: 10.1179/000870410X12658023467367
This map utilizes data from the Center for Spatial Data Science. The center produced a data set which includes the average times it would take to drive from every census tract to every other census tract in Chicago. This map presents the average time to drive to all other census tracts, or the average amount of time it would take to get anywhere else in the city from a given tract.
I used this R code to pull census and geospatial info together since geopandas is insanity.
#setup
# tidycensus pulls ACS tables; tigris supplies TIGER geometries (as sf
# objects, cached locally); geojsonio writes the outputs read by the
# Python/Altair script above.
library(tidycensus)
library(tidyverse)
library(tigris)
library(sf)
library(geojsonio)
options(tigris_class = "sf")
options(tigris_use_cache = TRUE)
# ACS variable ids fetched below: per the derived names in the mutate()
# calls further down, the B08301_* series feeds the commute-mode shares
# (drive/transit/bus/train/bicycle-walk) and the B08303_* series feeds the
# commute-duration shares.
variable_ls <- c(
  "B08301_001",
  "B08301_002",
  "B08301_010",
  "B08301_011",
  "B08301_013",
  "B08301_014",
  "B08301_018",
  "B08301_019",
  "B08303_001",
  "B08303_002",
  "B08303_003",
  "B08303_004",
  "B08303_005",
  "B08303_006",
  "B08303_007",
  "B08303_008",
  "B08303_009",
  "B08303_010",
  "B08303_011",
  "B08303_012",
  "B08303_013"
)
# pull the selected variables for every Cook County, IL tract (FIPS 17/031),
# wide format with tract geometry attached
cur_tbl <- get_acs(
  geography = "tract",
  variables = variable_ls,
  state = '17',
  county = '031',
  cache_table = TRUE,
  output = "wide",
  geometry = TRUE
)
# drop the margin-of-error columns (wide-format names ending in M, e.g.
# B08301_002M), keeping only the *E estimate columns
cur_tbl <- cur_tbl[, -grep("\\dM", colnames(cur_tbl))]
# derive commute-mode and commute-duration shares from the raw counts; the
# *_001E columns are the table totals used as denominators, and the _other /
# _fifty_plus buckets are computed as residuals so each family sums to 1
cur_tbl <- cur_tbl %>% mutate(
  pct_drive = B08301_002E / B08301_001E,
  pct_transit = B08301_010E / B08301_001E,
  pct_bus = B08301_011E / B08301_001E,
  pct_train = (B08301_013E + B08301_014E) / B08301_001E,
  pct_bicyclewalk = (B08301_018E + B08301_019E) / B08301_001E,
  pct_other = 1 - (pct_drive + pct_transit + pct_bicyclewalk),
  pct_commute_less_than_ten = (B08303_002E + B08303_003E) / B08303_001E,
  pct_commute_ten_twenty = (B08303_004E + B08303_005E) / B08303_001E,
  pct_commute_twenty_thirty = (B08303_006E + B08303_007E) / B08303_001E,
  pct_commute_thirty_forty = (B08303_008E + B08303_009E) / B08303_001E,
  pct_commute_forty_fifty = (B08303_010E + B08303_011E) / B08303_001E,
  pct_commute_fifty_plus = 1 - (
    pct_commute_less_than_ten + pct_commute_ten_twenty + pct_commute_twenty_thirty + pct_commute_thirty_forty + pct_commute_forty_fifty
  )
)
#filter for chicago tracts only
chicago <-
read_sf(
"~/../Desktop/data viz/hmwk 2/geo_export_61c8e856-f8b3-433a-89b1-4b925f5ea983.shp"
)
cur_tbl <- cur_tbl %>% filter(GEOID %in% chicago$geoid10)
#add community areas
# community-area polygons, reprojected to the tract layer's CRS so the
# areal interpolation below is valid
community_areas <-
  read_sf(
    "~/../Desktop/data viz/hmwk 2/geo_export_9aabff09-40d1-41ae-9ed6-52535c9be035.shp"
  ) %>% st_transform(st_crs(cur_tbl))
# areal-weighted interpolation of tract values onto community-area polygons;
# extensive = TRUE treats the ACS variables as counts (summed by area weight)
weighted_est <-
  st_interpolate_aw(cur_tbl %>% select(-GEOID,-NAME), community_areas, extensive = TRUE)
weighted_est['community'] <- community_areas$community
# recompute the mode and duration shares from the interpolated community-
# area counts (same formulas as the tract-level mutate above)
weighted_est <-
  weighted_est %>% mutate(
    pct_drive = B08301_002E / B08301_001E,
    pct_transit = B08301_010E / B08301_001E,
    pct_bus = B08301_011E / B08301_001E,
    pct_train = (B08301_013E + B08301_014E) /
      B08301_001E,
    pct_bicyclewalk = (B08301_018E + B08301_019E) /
      B08301_001E,
    pct_other = 1 - (pct_drive + pct_transit + pct_bicyclewalk),
    pct_commute_less_than_ten = (B08303_002E + B08303_003E) / B08303_001E,
    pct_commute_ten_twenty = (B08303_004E + B08303_005E) / B08303_001E,
    pct_commute_twenty_thirty = (B08303_006E + B08303_007E) / B08303_001E,
    pct_commute_thirty_forty = (B08303_008E + B08303_009E) / B08303_001E,
    pct_commute_forty_fifty = (B08303_010E + B08303_011E) / B08303_001E,
    pct_commute_fifty_plus = 1 - (
      pct_commute_less_than_ten + pct_commute_ten_twenty + pct_commute_twenty_thirty + pct_commute_thirty_forty + pct_commute_forty_fifty
    )
  )
# community-area centroid coordinates supply the Latitude/Longitude fields
# used by the Altair bubble maps
centroids <- st_coordinates(st_centroid(weighted_est))
weighted_est['Latitude'] <- centroids[, 2]
weighted_est['Longitude'] <- centroids[, 1]
# export both layers as GeoJSON for the Python/Altair script
geojson_write(cur_tbl, file = "~/../Desktop/data viz/hmwk 2/chicago.geojson", overwrite = TRUE)
geojson_write(weighted_est, file = "~/../Desktop/data viz/hmwk 2/community_areas.geojson", overwrite = TRUE)